/*******************************************************************************
********************************************************************************
********** Clean data received from LAUSD for ZOC Project ********************** 
********************************************************************************
********************************************************************************
*  Created by: Chris Campos 2019-08-04

*  Consists of 
*  1. Location and demographics (2012-2019 end year) - DONE
*  2. CST Scores (2012-2016 end year) - DONE
*  3. SBAC Scores (2015-2018 end year)  - DONE
*  4. GPA (2012-2019 end year) only for 6th-12th grade - DONE
*  5. Yearly attendance   - DONE 
*  6. SAT Scores - Done 
*  7. Suspensions - Done 
*  8. Grades in STEM, ELA, social studies, elementary school marks - DONE  
*  9. UC/CSU preparedness - DONE   
*  10. Geographic information, student census blocks -- DONE
*
*  Runtime: 60 minutes
* ****************************************************************************/ 

set trace off
set tracedepth 2
timer clear 
timer on 1

capture program drop main 
program define main

	paths 


	cleanDemographics
	cleanCST
	cleanSBAC
	cleanGPA
	
	cleanUC_CSU
	cleanSuspensions
	cleanSAT

	cleanAttendance
	cleanGrades

	mergeScoresAndMigrants
	mergeALL

	prepAddressesForBuild
	mergeAddresses 
	mergeBoundaries

end 

capture program drop paths 
program define paths 
	global dir "DIRECTORY1"
	global datadir "DIRECTORY2"
	global builddir "$datadir/build"


	global demographics "$datadir/demos"
	global cstdata "$datadir/cst"
	global sbacdata "$datadir/sbac"
	global attendancedata "$datadir/attendance"
	global gpadata "$datadir/gpa"
	global ucdata "$datadir/uccsu"
	global grades "$datadir/grades"
	global suspensions "$datadir/suspensions"
	global sat "$datadir/sat"
	
	global savedir $dir/dataSTATA
	global figures $dir/figures 

	
	
end 


capture program drop cleanDemographics
program define cleanDemographics
	capture log close 
	log using $builddir/logs/cleanDemographics.log  , replace 
	clear 
	tempfile locationmaster 
	save `locationmaster', emptyok
	forvalues year = 2002/2021{
		* Append all datasets 
		import delimited "$demographics/demo2002.csv", clear
		tostring studentspedflag, replace 
		tostring studentgiftedprogramdescription, replace 
		append using `locationmaster'
		save `locationmaster', replace 	
	}

	
	* Ensure ID
	* 112 person obs that are dups -- not a significant amount so just randomly drop 
	duplicates drop studentpseudoid endyear, force 
	isid studentpseudoid endyear 
	save $builddir/demographics2002_2021.dta, replace 
	cap log close 

end 



capture program drop cleanCST 
program define cleanCST 
	cap log close 
	cap log using $builddir/logs/cleanCST.log, replace 
	clear 
	tempfile cstmaster 
	save `cstmaster', emptyok 
	
	forvalues year = 2003/2016{
		* CST X.csv means that data for exams taken in year X, which corresponds to academic year X-1 with endyear X. For example,
		* exams taken in 2003, correspond to academic year 2002-2003 with endyear 2003. 
		import delimited "$cstdata/CST `year'.csv", clear 
		tostring gradecode, replace 
		gen endyear = `year'
		* Count obs 
		tab endyear
		* tab exams taken in year 
		tab testname 
		format studentpseudoid %20.0g
		append using `cstmaster'
		save `cstmaster', replace 
	}
	
	

	drop if studentpseudoid==.
	drop if cstscaledscoreamount==.
	
	
	* Only keep math and ela
	gen ela = regexm(testname, "ENGLISH")
	gen math = regexm(testname, "MATH")
	replace math = 1 if regexm(testname, "ALGEBRA")
	replace math = 1 if regexm(testname, "GEOMETRY")
	keep if math == 1 | ela == 1
	
	* reshape 
	gen subject = "ELA" if ela==1
	replace subject = "MATH" if math==1
	drop ela math testexamname exclusioncode
	* Some students (5000) took same exam multiple times in one year -- keep max score 
	bys studentpseudoid schoolyear gradecode endyear subject: egen maxScore = max(cstscaledscoreamount )
	keep if cstscaledscoreamount == maxScore 
	* One student remains as dup 
	duplicates drop studentpseudoid schoolyear gradecode endyear subject, force 
	drop maxScore 

	rename caperformanceleveldescription proficiency
	rename cstscaledscoreamount cst
	reshape wide cst proficiency testname, i(studentpseudoid schoolyear gradecode endyear) j(subject) string
	drop if testnameMATH =="" | testnameELA ==""
	duplicates drop studentpseudoid endyear , force 
	isid studentpseudoid endyear 

	* standardize test scores 
	bys gradecode endyear : egen meanELA = mean(cstELA)
	bys gradecode endyear : egen meanMATH = mean(cstMATH)
	bys gradecode endyear : egen sdELA = sd(cstELA)
	bys gradecode endyear : egen sdMATH = sd(cstMATH)

	gen z_math = (cstMATH - meanMATH)/sdMATH
	gen z_ela = (cstELA - meanELA)/sdELA

	save $builddir/cst_grade2_11_2002_2012.dta, replace 
	cap log close
end 




capture program drop cleanSBAC 
program define cleanSBAC 

	cap log close 
	log using $builddir/logs/cleanSBAC.log, replace 
	clear
	tempfile sbacmaster 
	save `sbacmaster', emptyok 
	
	import delimited "$sbacdata/sbac2015.csv", clear 
	tostring gradecode, replace 
	gen endyear = 2015
	append using `sbacmaster'
	save `sbacmaster', replace

	import delimited "$sbacdata/sbac2016.csv", clear 
	tostring gradecode, replace 
	gen endyear = 2016
	append using `sbacmaster'
	save `sbacmaster', replace

	import delimited "$sbacdata/sbac2017.csv", clear 
	tostring gradecode, replace 
	gen endyear = 2017
	append using `sbacmaster'
	save `sbacmaster', replace

	import delimited "$sbacdata/sbac2018.csv", clear 
	tostring gradecode, replace 
	gen endyear = 2018
	append using `sbacmaster'
	save `sbacmaster', replace

	import delimited "$sbacdata/sbac2019.csv", clear 
	tostring gradecode, replace 
	gen endyear = 2019
	append using `sbacmaster'
	save `sbacmaster', replace


	import delimited "$sbacdata/sbac2021.csv", clear 
	tostring gradecode, replace 
	gen endyear = 2021
	append using `sbacmaster'
	save `sbacmaster', replace


	rename overallscalescore sbac 
	rename overallperformancelevel performance 
	reshape wide sbac performance, i(studentpseudoid endyear schoolyear gradecode ) j(testgroupname) string


	* standardize test scores 
	bys gradecode endyear : egen meanELA = mean(sbacELA)
	bys gradecode endyear : egen meanMATH = mean(sbacMath)
	bys gradecode endyear : egen sdELA = sd(sbacELA)
	bys gradecode endyear : egen sdMATH = sd(sbacMath)

	gen z_math_sbac = (sbacMath - meanMATH)/sdMATH
	gen z_ela_sbac = (sbacELA - meanELA)/sdELA
	drop gradecode meanELA meanMATH sdELA sdMATH

	save ${builddir}/sbac_grade3_8_11_2015_2021.dta, replace 
end 







capture program drop cleanSuspensions 
program define cleanSuspensions

	import delimited "$suspensions/suspensions.csv", clear 
	*rename ïendyear endyear 
	collapse (sum) ofsuspendeddays (count) incidents = ofsuspendeddays, by(studentpseudoid endyear)
	label var ofsuspendeddays "Number of suspended days in school year "
	label var incidents "Number of suspension incidents in school year"
	save $builddir/suspensions2002_2018.dta, replace  
end 

capture program drop cleanGPA
program define cleanGPA
	cap log close 
	cap log using $builddir/logs/cleanGPA.log, replace 
	* Make a panel of gpa for grades 6-12 , 2002-2018
	clear 
	tempfile gpamaster_612 
	save `gpamaster_612', emptyok
	* This is end year - so looping through end years 
	forvalues yr = 2002/2021{
		import delimited "$gpadata/GPA `yr'.csv", clear 
		gen semester = "FALL" if occurencename=="END FALL"
		replace semester = "SPRING" if occurencename=="END SPRING"
		* Drop obs that shouldn't be there 
		if `yr' >2014{
			keep if inrange(gradecode, 6,12	)
		}
		else{
			drop if inlist(gradecode, "1", "5", "K" , "U", "UNK", "K", "PK")
			destring gradecode , replace 
			keep if inrange(gradecode, 6,12	)
		}
		gen endyear = `yr'
		rename augmentedcumulativegpa gpa
		keep studentpseudoid gpa semester endyear 
		reshape wide gpa , i(studentpseudoid endyear) j(semester) string 
		append using `gpamaster_612'
		save `gpamaster_612', replace 
	}
	format studentpseudoid %20.0g

	save $builddir/gpa2002_2021_gr6_12.dta, replace  

	cap log close 
end 

capture program drop cleanSAT 
program define cleanSAT 
syntax, [data(string)]
	tempfile masterSAT
	clear 
	save `masterSAT', emptyok 
	forvalues year = 2007/2011{
		import delimited "$sat/Sat `year'.txt", clear 
		cap destring write, replace
		rename std_pseudo_id studentpseudoid
		keep studentpseudoid math verbal write 
		gen endyear = `year'
		append using `masterSAT' 
		save `masterSAT', replace 

	}
	import excel "$sat/Sat 2012.xlsx", sheet("sat1112") firstrow  case(lower) clear
	rename std_pseudo_id studentpseudoid
	keep studentpseudoid math verbal write 
	gen endyear = 2012 
	append using `masterSAT'
	save `masterSAT', replace 
	import excel "$sat/SAT 2013.xlsx", sheet("sat1213") firstrow case(lower) clear
	keep studentpseudoid math verbal write 
	gen endyear = 2013 
	append using `masterSAT'
	save `masterSAT', replace 
	import excel "$sat/SAT 2014.xlsx", sheet("sat1314") firstrow case(lower) clear
	keep studentpseudoid math verbal write 
	gen endyear = 2014
	append using `masterSAT'
	save `masterSAT', replace
	import excel "$sat/SAT 2015.xlsx", sheet("sat1415") firstrow case(lower) clear
	keep studentpseudoid math verbal write 
	gen endyear = 2015
	append using `masterSAT'
	save `masterSAT', replace
	import excel "$sat/SAT 2016.xlsx", sheet("sat2015_2016_20160722") firstrow clear case(lower)	
	rename subv verbal 
	rename subm math
	keep studentpseudoid math verbal write  
	gen endyear = 2016
	append using `masterSAT'
	save `masterSAT', replace
	import excel "$sat/SAT 2017.xlsx", sheet("sat2015_2016_20160722") firstrow clear case(lower)
	rename subv verbal 
	rename subm math
	destring write, replace 
	keep studentpseudoid math verbal write  
	gen endyear = 2017
	append using `masterSAT'
	save `masterSAT', replace

	* For students who took exam multiple times in one year, assign average of the exams
	bys studentpseudoid endyear: egen meanMath =mean(math)
	bys studentpseudoid endyear: egen meanWrite= mean(write)
	bys studentpseudoid endyear: egen meanVerbal = mean(verbal)
	drop math verbal write 
	collapse (mean) math = meanMath write = meanWrite verbal = meanVerbal, by(studentpseudoid endyear)

	save $builddir/sat2007_2017.dta, replace

end 


capture program drop cleanUC_CSU
program define cleanUC_CSU


	tempfile masterUC
	clear 
	save `masterUC', emptyok
	
	* Data ID is studentid, endyear, occurencename, academicrequirementcode 
	* We want to keep only Spring occurrences - tells us how many UC/CSU credits have been accrued up to that point
	* We also want to change from long to wide format w.r.t academic requirement code e.g. CSU or UC 	
	forvalues year = 2004/2018{
		import delimited "$ucdata/uc`year'.csv", clear 
		*rename ïendyear endyear 
		if `year' ==2017 | `year'==2018 {
			keep if occurencename=="SPRING"
		}
		else{
			keep if occurencename =="END SPRING"
		}
		keep endyear studentpseudoid classof agstatusabbreviateddescription agstatustypecode agsubjectname agcreditscomplete academicrequirementcode	
		rename agstatusabbreviateddescription agstatusdesc
		reshape wide agstatusdesc agcreditscomplete agstatustypecode , i(studentpseudoid endyear) j(academicrequirementcode ) string
		label var agstatusdescUC "UC - AG Status Abbreviated Description"
		label var agcreditscompleteUC "UC - AG Credits Complete"
		label var agstatustypecodeUC "UC - AG Status Type Code" 
		label var agstatusdescCSU "CSU - AG Status Abbreviated Description"
		label var agcreditscompleteCSU "CSU - AG Credits Complete"
		label var agstatustypecodeCSU "CSU - AG Status Type Code" 
		drop agsubjectname
		cap destring classof, replace 
		append using `masterUC'
		save `masterUC', replace 

	}

	save $builddir/uc_csu_2004_2018.dta, replace 
end 

capture program drop cleanAttendance 
program define cleanAttendance

	clear 
	tempfile attendancemaster
	save `attendancemaster', emptyok
	cap log close 
	cap log using $builddir/logs/cleanAttendance.log, replace 
	* For 2002-2009, attendance data in two separate files each year - elementary and secondary 
	* Attendance data missing for elementary 2002, 2003, for now 
	* Data ID is: studentpseudoid endyear preferredlocationcode 
	* In other words, if a student changes schools within a year they will have multiple entries 

	import delimited  using "$attendancedata/Secondary Attendance 2002.csv", clear 
	gen ytdofabsentdays = appabsencecount + otherabsencecount
	rename (daysenrolledcount daysattendancecount ) (ytdenrolleddays ytdattendeddays)
	*rename ïendyear endyear
	* Drop observations where the student was not enrolled in any days at that school
	drop if ytdenrolleddays == 0
	* Some students have two entries for a given school they attended. Collapse to make unique for each school
	bys studentpseudoid endyear preferredlocationcode: gen schoolStuCount = _N
	collapse (sum) ytdofabsentdays ytdenrolleddays ytdattendeddays, by(studentpseudoid  endyear preferredlocationcode)
	bys studentpseudoid endyear: gen numSchoolsAttended = _N 
	gen switchedSchools = numSchoolsAttended >1
	* Collapse to the student level, documenting switchers and the number of schools attended 
	collapse (sum) ytdofabsentdays ytdenrolleddays ytdattendeddays (mean) numSchoolsAttended switchedSchools, by(studentpseudoid  endyear) 
	keep studentpseudoid 	endyear ytd*  numSchoolsAttended switchedSchools
	append using `attendancemaster'
	save `attendancemaster', replace 

	import delimited  using "$attendancedata/Secondary Attendance 2003.csv", clear 
	gen ytdofabsentdays = appabsencecount + otherabsencecount
	rename (daysenrolledcount daysattendancecount ) (ytdenrolleddays ytdattendeddays)
	*rename ïendyear endyear
	* Drop observations where the student was not enrolled in any days at that school
	drop if ytdenrolleddays == 0
	* Some students have two entries for a given school they attended. Collapse to make unique for each school
	collapse (sum) ytdofabsentdays ytdenrolleddays ytdattendeddays, by(studentpseudoid endyear  preferredlocationcode)
	bys studentpseudoid endyear: gen numSchoolsAttended = _N 
	gen switchedSchools = numSchoolsAttended >1
	* Collapse to the student level, documenting switchers and the number of schools attended 
	collapse (sum) ytdofabsentdays ytdenrolleddays ytdattendeddays (mean) numSchoolsAttended switchedSchools, by(studentpseudoid  endyear) 
	keep studentpseudoid  endyear ytd*  numSchoolsAttended switchedSchools
	append using `attendancemaster'
	save `attendancemaster', replace 

	* Do elementary and secondary for 2004-2009 
	forvalues year = 2004/2009{
		import delimited  using "$attendancedata/Secondary Attendance `year'.csv", clear 
		gen ytdofabsentdays = appabsencecount + otherabsencecount
		rename (daysenrolledcount daysattendancecount ) (ytdenrolleddays ytdattendeddays)
		*rename ïendyear endyear
		* Drop observations where the student was not enrolled in any days at that school
		drop if ytdenrolleddays == 0
		* Some students have two entries for a given school they attended. Collapse to make unique for each school
		collapse (sum) ytdofabsentdays ytdenrolleddays ytdattendeddays, by(studentpseudoid endyear  preferredlocationcode)
		bys studentpseudoid endyear: gen numSchoolsAttended = _N 
		gen switchedSchools = numSchoolsAttended >1
		* Collapse to the student level, documenting switchers and the number of schools attended 
		collapse (sum) ytdofabsentdays ytdenrolleddays ytdattendeddays (mean) numSchoolsAttended switchedSchools, by(studentpseudoid  endyear) 
		keep studentpseudoid endyear ytd*  numSchoolsAttended switchedSchools
		append using `attendancemaster'
		save `attendancemaster', replace 

		import delimited  using "$attendancedata/Elementary Attendance `year'.csv", clear 
		rename absentdaysquantity ytdofabsentdays 
		rename presentdaysquantity ytdattendeddays
		gen ytdenrolleddays = ytdattendeddays + ytdofabsentdays
		*rename ïendyear endyear
		* Drop observations where the student was not enrolled in any days at that school
		drop if ytdenrolleddays == 0
		* Some students have two entries for a given school they attended. Collapse to make unique for each school
		collapse (sum) ytdofabsentdays ytdenrolleddays ytdattendeddays, by(studentpseudoid endyear  preferredlocationcode)
		bys studentpseudoid endyear: gen numSchoolsAttended = _N 
		gen switchedSchools = numSchoolsAttended >1
		* Collapse to the student level, documenting switchers and the number of schools attended 
		collapse (sum) ytdofabsentdays ytdenrolleddays ytdattendeddays (mean) numSchoolsAttended switchedSchools, by(studentpseudoid  endyear) 	
		keep studentpseudoid endyear ytd* numSchoolsAttended switchedSchools
		append using `attendancemaster'		
		save  `attendancemaster', replace 
	}
	* Every year, some students (less than 70) in the EL data are also in the SEC data. Aggregate their attendance 
	collapse (sum) ytdofabsentdays ytdenrolleddays ytdattendeddays (mean) numSchoolsAttended switchedSchools, by(studentpseudoid  endyear) 	
	isid studentpseudoid endyear
	save `attendancemaster', replace 

	* Do for 2010-2018 
	forvalues year = 2010/2021{
		import delimited "$attendancedata/Attendance YTD `year'.csv", clear
		rename ytdofenrolleddays ytdenrolleddays 
		rename ytdofattendeddays ytdattendeddays 
		gen endyear = `year'
		keep studentpseudoid endyear ytdenrolleddays ytdattendeddays ytdofabsentdays
		append using `attendancemaster' 
		save `attendancemaster', replace 
	}
	isid studentpseudoid endyear 
	save $builddir/attendance_2002_2021.dta, replace
	cap log close
end 

* The purpose of this subprogram is to track students courses taken in Science, ELA, Math
capture program drop cleanGrades 
program define cleanGrades

	cap log close 
	cap log using $builddir/logs/cleanGrades.log, replace 
	tempfile masterGrades
	clear  
	save `masterGrades', emptyok

	forvalues year = 2002/2021{
		import delimited "$grades/grades`year'.csv", clear 
		keep if departmentname =="ENGLISH" | departmentname =="MATHEMATICS" | departmentname =="SOCIAL SCIENCE" | departmentname == "SCIENCE"
		drop if preferredlocationname=="UNKNOWN"

		duplicates drop occurencename studentpseudoid departmentname coursename, force

		destring coursecode, replace 
		* Want to tag how many of each type of student took in a semester 
		sort occurencename studentpseudoid departmentname coursename 
		bys occurencename studentpseudoid departmentname: gen courseNum = _n

		gen honors = honor_course_flag=="Y"
		gen ap = apcourseflag =="Y"
		*drop if courseNum >2 
		label var courseNum "Counter for number of courses taken by student within department"
		gen grade_points = 4 if mark_for_achievement == "A" & ap==0
		replace grade_points = 3 if mark_for_achievement== "B" & ap==0
		replace grade_points = 2 if mark_for_achievement=="C" & ap==0
		replace grade_points = 1 if mark_for_achievement =="D" & ap==0
		replace grade_points = 0 if mark_for_achievement=="F" & ap==0
		replace grade_points = 5 if mark_for_achievement=="A" & ap==1
		replace grade_points = 4 if mark_for_achievement== "B" & ap==1
		replace grade_points = 3 if mark_for_achievement=="C" & ap==1
		replace grade_points = 2 if mark_for_achievement =="D" & ap==1
		replace grade_points = 1 if mark_for_achievement=="F" & ap==1

	
		* calculate gpa 
		bys occurencename studentpseudoid departmentname endyear: egen gpa = mean(grade_points)
		bys occurencename studentpseudoid departmentname endyear: gen numCourses = _N 
		bys occurencename studentpseudoid departmentname endyear: egen numAPCourses = total(ap)
		bys occurencename studentpseudoid departmentname endyear: egen numHonorsCourses = total(honors)
		* number of ap and honors courses taken in a year 
		bys studentpseudoid endyear: egen ytd_ap = total(ap)
		bys studentpseudoid endyear: egen ytd_honors = total(honors)

		collapse (mean) gpa numCourses numAPCourses numHonorsCourses ytd_ap ytd_honors, by(studentpseudoid endyear departmentname occurencename)
		replace departmentname = "_" + departmentname
		replace departmentname = "_SS" if departmentname == "_SOCIAL SCIENCE"
		replace departmentname = "_ELA" if departmentname == "_ENGLISH"
		replace departmentname = "_MATH" if departmentname == "_MATHEMATICS"
		replace departmentname = "_SCI" if departmentname=="_SCIENCE"
		gen semester_subject = occurencename  + departmentname
		drop occurencename departmentname
		reshape wide gpa numCourses numAPCourses numHonorsCourses, i(studentpseudoid endyear) j(semester_subject) string  
		label var ytd_ap "Number of AP courses taken in academic year"
		label var ytd_honors "Number of honors courses taken in academic year"
		local subjects "SS ELA MATH SCI"
		local semesters "FALL SPRING"

		foreach semester of local semesters{
			foreach sub of local subjects{
				if "`sub'" == "SS" local subject = "Social Science"
				if "`sub'" == "SCI" local subject = "Science"
				if "`sub'" == "MATH" local subject = "Math"
				if "`sub'" == "ELA" local subject = "ELA"
				label var gpa`semester'_`sub' "`semester' Semester GPA for `subject' courses"
				label var numCourses`semester'_`sub' "Total # `subject' courses taken in `semester'"
				label var numAPCourses`semester'_`sub' "Total # `subject' AP courses taken in `semester'"
				label var numHonorsCourses`semester'_`sub' "Total # `subject' Honors courses taken in `semester'"
			}
		}

		append using `masterGrades'
		save `masterGrades', replace
	}
	compress
	save $builddir/courses_grades_2002_2021.dta, replace  
	cap log close 
end 



capture program drop mergeScoresAndMigrants
program define mergeScoresAndMigrants 

	set seed 1234 
	* 941 dups in the sbac data 
	use ${builddir}/sbac_grade3_8_11_2015_2021.dta, clear 
	duplicates drop studentpseudoid endyear , force 
	tempfile sbac 
	save `sbac'

	use $builddir/demographics2002_2021.dta, clear 
	merge 1:1 studentpseudoid endyear using $builddir/cst_grade2_11_2002_2012.dta, gen(mergeCST) keep(1 3) keepusing(testnameELA cstELA proficiencyELA testnameMATH cstMATH proficiencyMATH meanELA meanMATH sdELA sdMATH z_math z_ela)
	merge 1:1 studentpseudoid endyear using `sbac', gen(mergeSBAC) keep(1 3)
	gen z_math_all = z_math 
	replace z_math_all = z_math_sbac if mergeSBAC ==3
	gen z_ela_all = z_ela 
	replace z_ela_all = z_ela_sbac if mergeSBAC ==3
	rename studentbirthcountry birthcountry
	rename studentusschoolfirstattenddate firstUS
	rename studentpseudoid stuid

	* Identify migrants 
	replace birthcountry = strtrim(birthcountry)
	bys stuid: egen modeBirth = mode(birthcountry ), maxmode
	* Count how many individuals have multiple birth countries reported -- there are non
	count if birthcountry != modeBirth & birthcountry !=""
	* Clean birth country 
	gen birth_loc = modeBirth
	replace birth_loc = "USA" ///
		if modeBirth=="ALABAMA" | ///
			modeBirth=="ALASKA" | ///
			modeBirth=="ARKANSAS" | ///
			modeBirth=="ARIZONA" | ///
			modeBirth=="CALIFORNIA" | ///
			modeBirth=="CONNECTICUT" | ///
			modeBirth=="COLORADO" | ///
			modeBirth=="DELAWARE" | ///
			modeBirth=="DISTRT COLUMBIA" | ///
			modeBirth=="FLORIDA" | ///
			modeBirth=="GEORGIA" | ///
			modeBirth=="HAWAII" | ///
			modeBirth=="IDAHO" | ///
			modeBirth=="ILLINOIS" | ///
			modeBirth=="INDIANA" | ///
			modeBirth=="IOWA" | ///
			modeBirth=="KANSAS" | ///
			modeBirth=="KENTUCKY" | ///
			modeBirth=="LOUISIANA" | ///
			modeBirth=="MAINE" | ///
			modeBirth=="MARYLAND" | ///
			modeBirth=="MASSACHUSETTS" | ///
			modeBirth=="MINNESOTA" | ///
			modeBirth=="MISSISSIPPI" | ///
			modeBirth=="MISSOURI" | ///
			modeBirth=="MONTANA" | ///
			modeBirth=="MICHIGAN" | ///
			modeBirth=="NEBRASKA" | ///
			modeBirth=="NEVADA" | ///
			modeBirth=="NEW HAMPSHIRE" | ///
			modeBirth=="NEW JERSEY" | ///
			modeBirth=="NEW MEXICO" | ///
			modeBirth=="NEW YORK" | ///
			modeBirth=="NORTH CAROLINA" | ///
			modeBirth=="NORTH DAKOTA" | ///
			modeBirth=="OHIO" | ///
			modeBirth=="OKLAHOMA" | ///
			modeBirth=="OREGON" | ///
			modeBirth=="PENNSYLVANIA" | ///
			modeBirth=="RHODE ISLAND" | ///
			modeBirth=="SOUTH CAROLINA" | ///
			modeBirth=="SOUTH DAKOTA" | ///
			modeBirth=="TENNESSEE" | ///
			modeBirth=="TEXAS" | ///
			modeBirth=="UTAH" | ///
			modeBirth=="VERMONT" | ///
			modeBirth=="VIRGINIA" | ///
			modeBirth=="WASHINGTON" | ///
			modeBirth=="WEST VIRGINIA" | ///
			modeBirth=="WISCONSIN" | ///
			modeBirth=="WYOMING" 
	replace birth_loc ="USA" if modeBirth=="UNITED STATES"
	replace birth_loc = "USA" if modeBirth=="US"

	gen migrant_loc = birth_loc !="USA"
	label var migrant_loc "Migrant based on birth location"

	* Identify potential migrants based on date started school 
	gen firstUSDate = date(firstUS, "YMDhms")
	format firstUSDate %td
	gen firstYear = year(firstUSDate)
	label var firstYear "Year student first attended a US school"
	bys stuid: egen firstUSYear = min(firstYear)
	label var firstYear "Year student first attended a US school - constant for each student"
	* Assign ages 
	gen age = gradecode + 5 
	label var age "Age of student assuming they are six years old in sixth grade"
	* 1. tag the year they should have started first grade 
	*    - For those that appear in 1st grade, tag the year 
	*.   - For those not, tag the minimum year and input 
	gen startyear = endyear -1
	label var startyear "Year X of academic year X-Y e.g. 2014 of AY 2014-2015"
	bys stuid: egen minYear = min(startyear)
	bys stuid: egen minGrade = min(gradecode)
	bys stuid: egen minAge = min(age)
	label var minYear "First year we observe student in LAUSD"
	label var minGrade "First grade we observe student in LAUSD"
	label var minAge "First age we observe student in LAUSD"
	gen startschoolyear = minYear if minGrade ==1
	replace startschoolyear = minYear - 1 if minGrade==2
	replace startschoolyear = minYear - 2 if minGrade==3
	replace startschoolyear = minYear - 3 if minGrade==4
	replace startschoolyear = minYear - 4 if minGrade==5
	replace startschoolyear = minYear - 5 if minGrade==6
	replace startschoolyear = minYear - 6 if minGrade==7
	replace startschoolyear = minYear - 7 if minGrade==8
	replace startschoolyear = minYear - 8 if minGrade==9
	replace startschoolyear = minYear - 9 if minGrade==10
	replace startschoolyear = minYear - 10 if minGrade==11
	replace startschoolyear = minYear - 11 if minGrade==12
	label var startschoolyear "Year student should have started first grade"
	* If your first US school date year is after your imputed second grade year, then tag is potential migrant
	gen migrant = migrant_loc
	gen migrant_year = firstUSYear > startschoolyear + 1 if migrant==1
	label var migrant_year "Migrants identified by US first school start date and birth country"
	gen year_born = startschoolyear - 6 
	tab year_born 
	label var year_born "Year student was born based on assumption of being 6 in first grade"
	gen age_migrate = firstUSYear - year_born if migrant==1 
	replace age_migrate = 0 if age_migrate <0 & migrant==1
	gen sped = studentspedflag == "Y" if endyear >2003
	gen poverty = studentpovertyindicator == "Y"
	gen female = gendercode=="F"

	label var female "Indicator for female student"
	label var poverty "Indicator for being flagged as impoverished by LAUSD"
	label var sped "Indicator for being in special education"
	label var age_migrate "Age migrated to Los Angeles"
	*label var iso3 "Country labels"


	* Drop cst/sbac variables we don't need anymore
	drop meanELA meanMATH sdELA sdMATH 

	* Label CST/SBAC variables
	label var z_ela "Grade-year normalized CST score in ELA"
	label var z_math "Grade-year normalized CST score in Math"
	label var z_ela_sbac "Grade-year normalized SBAC score in ELA"
	label var z_math_sbac "Grade-year normalized SBAC score in Math"
	label var z_ela_all "Grade-year normalized CST/SBAC scores in ELA - interrupted in 2014"
	label var z_math_all "Grade-year normalized CST/SBAC scores in Math - interrupted in 2014"
	label var migrant "Indicator for being a migrant to the US"
	drop modeBirth
	label var birth_loc "Clean birth country"


	save $builddir/cleanScoresDemo2002_2021.dta, replace 
end 



capture program drop mergeALL 
program define mergeALL

	use $builddir/cleanScoresDemo2002_2021.dta, clear 
	rename stuid studentpseudoid

	merge 1:1 studentpseudoid endyear using $builddir/suspensions2002_2018.dta, gen(mergeSuspensions) keep(1 3)
	tab endyear if mergeSuspensions==3
	merge 1:1 studentpseudoid endyear using $builddir/gpa2002_2021_gr6_12.dta, gen(mergeGPA) keep(1 3)
	tab endyear if mergeGPA==3
	merge 1:1 studentpseudoid endyear using $builddir/sat2007_2017.dta, gen(mergeSAT) keep(1 3)
	tab endyear if mergeSAT==3
	merge 1:1 studentpseudoid endyear using $builddir/uc_csu_2004_2018.dta, gen(mergeUC) keep(1 3)
	tab endyear if mergeUC==3
	merge 1:1 studentpseudoid endyear using $builddir/attendance_2002_2021.dta, gen(mergeAttendance) keep(1 3)
	tab endyear if mergeAttendance==3
	merge 1:1 studentpseudoid endyear using $builddir/courses_grades_2002_2021.dta, gen(mergeGrades) keep(1 3)
	tab endyear if mergeGrades==3

	sort studentpseudoid endyear 
	order studentpseudoid endyear gradecode schoolyear , first
	format studentpseudoid %15.0g


	* If student was not suspended in a year, they are not in suspension data 
	replace ofsuspendeddays = 0 if mergeSuspensions==1 
	replace incidents = 0 if mergeSuspensions==1
	drop if mergeSuspensions==2

	* GPA clean vars 
	label var gpaFALL "Fall GPA - Grades 6 through 12"
	label var gpaSPRING "Spring GPA - Grades 6 through 12"

	* SAT clean vars 
	rename (math write verbal) (sat_math sat_write sat_verbal)
	label var sat_math "Raw SAT Math score"
	label var sat_write "Raw SAT Writing score"
	label var sat_verbal "Raw SAT Verbal score"


	save $builddir/lausd2002_2021.dta, replace  

end 



capture program drop prepAddressesForBuild
program define prepAddressesForBuild
	clear
	use "$datadir/addresses_geocoded2001_2019_for_lazocs.dta", clear 
	format studentpseudoid %20.0g
	gsort studentpseudoid endyear - move_year -move_month -move_day 
	bys studentpseudoid: gen movenum = _n
	tostring x_block y_block, usedisplay replace
	keep studentpseudoid endyear GEOID10 x_block y_block  addr_city addr_zip movenum
	rename (endyear GEOID10 x_block y_block  ) ///
			(move_endyear blockid block_x block_y )
	reshape wide move_endyear blockid block_x block_y  addr_city addr_zip , i(studentpseudoid ) j(movenum)
	tempfile addresses 
	save `addresses'

	use "$builddir/lausd2002_2021.dta", clear 
	keep studentpseudoid endyear 
	*gen rand = runiform()
	*bys studentpseudoid: replace rand = rand[1]
	*keep if rand <.10
	*drop rand
	merge m:1 studentpseudoid using `addresses', gen(mergeAddresses) 
	erase `addresses'
	local address_vars "blockid   addr_city addr_zip"
	foreach v of local address_vars{
		gen `v' = ""
	}
	gen block_x = .
	gen block_y = .

	drop if mergeAddresses==2
	bys studentpseudoid: egen minYear = min(endyear)
	bys studentpseudoid: egen maxYear = max(endyear)
	gen move =0 
	* tag individuals who we observe a first address after the first year they are in our panel
	gen addressAfter = move_endyear1 >= minYear & !missing(move_endyear1) & !missing(minYear)
	gen addressBefore = move_endyear1 < minYear & !missing(move_endyear1) & !missing(minYear)

	* assign the first address we have for such individuals 
	local address_vars "blockid block_x block_y  addr_city addr_zip"
	foreach v of local address_vars{
		replace `v' = `v'1 if addressAfter==1 & endyear<=move_endyear1
		* For addresses we observe before student's first year in our data, assign closest to our first year
		forvalues move=1/12{
			replace `v' = `v'`move' if addressBefore==1 & move_endyear`move' < endyear
		}
		* make assignments for subsequent moves
		* make assignments for years before move that are still to be assigned
		* ensure the move occurs on or before the last year we observe the student
		* ensure we have a valid move date
		forvalues move = 2/12{
			replace `v' = `v'`move' if missing(`v') & endyear <=move_endyear`move' ///
					& move_endyear`move' <=maxYear & !missing(move_endyear`move')
			replace move = 1 if endyear==move_endyear`move' 
		}
	}
	* at this point, can drop all move-specific variables 
	drop addr_city1-addr_city12 addr_zip1-addr_zip12 move_endyear1-move_endyear12 blockid1-blockid12 
	* Tag the last year we have some recorded address information for student 
	gen max_assign_year = endyear if blockid!="" | addr_zip !=""
	bys studentpseudoid: egen max_address_year = max(max_assign_year)
	drop max_assign_year
	local address_vars "blockid   addr_city addr_zip block_x block_y"
	foreach v of local address_vars{
		gen last_`v' = `v' if max_address_year==endyear
		bys studentpseudoid: egen last_ass_`v' = mode(`v'), maxmode
		drop last_`v'
		rename last_ass_`v' last_`v'
	}
	* Make assignments now 
	foreach v of local address_vars{
		replace `v' = last_`v' if endyear >=max_address_year & mergeAddresses==3
		drop last_`v'
	}	
	drop max_address_year addressBefore addressAfter
	encode blockid, gen(censusblockid)
	drop blockid 
	encode addr_city, gen(stu_city) 
	drop addr_city
	encode addr_zip, gen(stu_zip)
	drop addr_zip
	drop minYear maxYear
	label define mergeAddresses 1 "No address information for this student" 3 "Address information available"
	label values mergeAddresses mergeAddresses
	replace move = . if mergeAddresses==1
	* count how many observations we end up not having any information for that were actually merged 
	count if mergeAddresses ==3 & (stu_zip ==. & censusblockid==.)
	label var censusblockid "Student address: census block ID"
	label var stu_city "Student address: city"
	label var stu_zip "Student address: zip"
	label var block_x "Student address: census block longitude"
	label var block_y "Student address: census block latitude"
	label var move "Indicator if student moved within academic year"
	label var mergeAddresses "Did we merge address information? (Constant within student)"
	compress 
	save $builddir/student_addresses_clean.dta, replace  
end



capture program drop mergeAddresses
program define mergeAddresses

	* Compress data before merging 
	use $builddir/lausd2002_2021.dta, clear 

	* drop unnecessary variables 
	drop timename occurencesequencenumber rankbyoccseqdecode ///
			occseqdecode ofstudents testexamname exclusionflag
			  
	* Encode to reduce size of data 
	encode preferredlocationname, gen(prefname)
	drop preferredlocationname
	rename prefname preferredlocationname
	order preferredlocationname, after(preferredlocationcode)
	encode localdistrictcode, gen(distcode)
	drop localdistrictcode
	rename distcode localdistrictcode
	order localdistrictcode, after(schoolyear)
	encode testnameELA, gen(testELA)
	encode testnameMATH, gen(testMATH)
	drop testnameELA testnameMATH
	rename (testELA testMATH) (testnameELA testnameMATH)
	order testnameMATH testnameELA, after(studentpovertyindicator)
	encode proficiencyELA , gen(profELA)
	encode proficiencyMATH , gen(profMATH)
	drop proficiencyMATH proficiencyELA
	rename (profMATH profELA) (proficiencyMATH proficiencyELA)
	order proficiencyELA proficiencyMATH, after(cstMATH)
	encode performanceELA , gen(perfELA)
	encode performanceMath, gen(perfMATH)
	drop performanceMath performanceELA
	rename (perfELA perfMATH) (performanceELA performanceMATH)
	order performanceMATH performanceELA, after(sbacMath)

	gen english_at_home = homelanguagedescription =="ENGLISH"
	gen spanish_at_home = homelanguagedescription =="SPANISH"
	label var english_at_home "Speaks English at home"
	label var spanish_at_home "Speaks Spanish at home"
	encode homelanguagedescription, gen(homelang)
	drop homelanguagedescription
	rename homelang homelanguagedescription
	order english_at_home spanish_at_home homelanguagedescription sped female poverty, after(firstUS)
	drop studentspedflag studentpovertyindicator gendercode



	gen english_learner = languageclasscode =="LEP"
	encode languageclasscode, gen(langcode)
	drop languageclasscode
	rename langcode languageclasscode
	order languageclasscode english_learner, after(homelanguagedescription)

	encode ethnicitydescription, gen(ethnicity)
	drop ethnicitydescription
	order ethnicity, after(poverty)
	encode studentgiftedprogramdescription , gen(gifted)
	drop studentgiftedprogramdescription
	order gifted, after(ethnicity)

	encode birth_loc , gen(birth_location)
	drop birth_loc 
	rename birth_location birth_loc 
	order birth_loc, after(z_ela_all)

	encode parentedulevelname, gen(pe)
	drop parentedulevelname
	rename pe parentedulevelname
	order parentedulevelname, after(firstUS)

	gen attendancerate = ytdattendeddays/ytdenrolleddays 
	order attendancerate, after(ytdofabsentdays)
	label var attendancerate "YTD attendace rate"

	gen college = parentedulevelname ==1 | parentedulevelname ==3
	order college, after(parentedulevelname)
	label var college "Parents completed college"
	label var english_learner "Classified as English learner"

	merge 1:1 studentpseudoid endyear using $builddir/student_addresses_clean.dta, gen(mergeAddressesToBuild)
	drop mergeAddressesToBuild

	save $builddir/lausd2002_2021.dta, replace 
 end 

* Merge high school attendance zone boundary information
capture program drop mergeBoundaries 
program define mergeBoundaries
	import delimited "${datadir}/blocks_with_zones.csv", clear 
	drop v1 geometry v8 block_x block_y
	format blockid10 %20.0g	
	tempfile blocks 
	save `blocks'

	use $builddir/lausd2002_2021.dta, clear 
	decode censusblockid, gen(blockid10)
	destring blockid10, replace 
	format blockid10 %20.0g
	merge m:1 blockid10 using `blocks', gen(boundaryInfo)

	duplicates drop studentpseudoid endyear, force
	save $builddir/lausd2002_2021.dta, replace 
end 

main

timer off 1 

timer list 1 
